{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "gensim\n",
    "http://nbviewer.jupyter.org/gist/boskaiolo/cc3e1341f59bfbd02726\n",
    "http://blog.csdn.net/u010297828/article/details/50464845\n",
    "\n",
    "\n",
    "\n",
    "sklearn\n",
    "http://www.cnblogs.com/pinard/p/6908150.html\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading 20news dataset. This may take a few minutes.\n",
      "INFO : Downloading 20news dataset. This may take a few minutes.\n",
      "Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n",
      "INFO : Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n"
     ]
    }
   ],
   "source": [
    "from sklearn import datasets\n",
    "news_dataset = datasets.fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "In the dataset there are 18846 textual documents\n",
      "And this is the first one:\n",
      "\n",
      "\n",
      "I am sure some bashers of Pens fans are pretty confused about the lack\n",
      "of any kind of posts about the recent Pens massacre of the Devils. Actually,\n",
      "I am  bit puzzled too and a bit relieved. However, I am going to put an end\n",
      "to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\n",
      "are killing those Devils worse than I thought. Jagr just showed you why\n",
      "he is much better than his regular season stats. He is also a lot\n",
      "fo fun to watch in the playoffs. Bowman should let JAgr have a lot of\n",
      "fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\n",
      "regular season game.          PENS RULE!!!\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# The list of text documents is stored in the dataset's `data` attribute\n",
    "documents = news_dataset.data\n",
    "\n",
    "print \"In the dataset there are\", len(documents), \"textual documents\"\n",
    "print \"And this is the first one:\\n\", documents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim\n",
    "from gensim.utils import simple_preprocess\n",
    "from gensim.parsing.preprocessing import STOPWORDS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# gensim.parsing.preprocessing.STOPWORDS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After the tokenizer, the previous document becomes:\n",
      "[u'sure', u'bashers', u'pens', u'fans', u'pretty', u'confused', u'lack', u'kind', u'posts', u'recent', u'pens', u'massacre', u'devils', u'actually', u'bit', u'puzzled', u'bit', u'relieved', u'going', u'end', u'non', u'pittsburghers', u'relief', u'bit', u'praise', u'pens', u'man', u'killing', u'devils', u'worse', u'thought', u'jagr', u'showed', u'better', u'regular', u'season', u'stats', u'lot', u'fo', u'fun', u'watch', u'playoffs', u'bowman', u'let', u'jagr', u'lot', u'fun', u'couple', u'games', u'pens', u'going', u'beat', u'pulp', u'jersey', u'disappointed', u'islanders', u'lose', u'final', u'regular', u'season', u'game', u'pens', u'rule']\n"
     ]
    }
   ],
   "source": [
    "def tokenize(text):\n",
    "    \"\"\"Tokenize `text` with gensim's simple_preprocess and drop gensim's stop words.\"\"\"\n",
    "    stop_words = gensim.parsing.preprocessing.STOPWORDS\n",
    "    candidates = gensim.utils.simple_preprocess(text)\n",
    "    return [word for word in candidates if word not in stop_words]\n",
    "\n",
    "print \"After the tokenizer, the previous document becomes:\\n\", tokenize(documents[0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "processed_docs = [tokenize(doc) for doc in documents]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "In the corpus there are 95507 unique tokens\n"
     ]
    }
   ],
   "source": [
    "word_count_dict = gensim.corpora.Dictionary(processed_docs)\n",
    "print \"In the corpus there are\", len(word_count_dict), \"unique tokens\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_count_dict.filter_extremes(no_below=20, no_above=0.1) # keep tokens that appear in at least 20 documents and in no more than 10% of all documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After filtering, in the corpus there are only 8121 unique tokens\n"
     ]
    }
   ],
   "source": [
    "print \"After filtering, in the corpus there are only\", len(word_count_dict), \"unique tokens\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "bag_of_words_corpus = [word_count_dict.doc2bow(pdoc) for pdoc in processed_docs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bag of words representation of the first document (tuples are composed by token_id and multiplicity):\n",
      "[(219, 1), (770, 2), (780, 2), (1354, 1), (1375, 1), (1568, 1), (1723, 2), (2024, 1), (2699, 1), (3194, 1), (3215, 1), (3353, 1), (3467, 1), (3755, 1), (3853, 1), (3880, 1), (3966, 1), (4213, 1), (4304, 2), (4678, 1), (4703, 1), (4840, 1), (4897, 1), (5001, 1), (5243, 5), (5398, 2), (5405, 1), (5454, 2), (5510, 3), (5694, 1), (5877, 1), (5985, 1), (6212, 1), (6273, 1), (6393, 1), (6437, 1), (6677, 1), (6852, 2), (6885, 1), (7031, 1), (7162, 1), (7185, 1), (7370, 1), (7882, 1)]\n",
      "\n",
      "In the document, topic_id 219 (word \"showed\") appears 1 time[s]\n",
      "In the document, topic_id 770 (word \"jagr\") appears 2 time[s]\n",
      "In the document, topic_id 780 (word \"going\") appears 2 time[s]\n",
      "In the document, topic_id 1354 (word \"recent\") appears 1 time[s]\n",
      "In the document, topic_id 1375 (word \"couple\") appears 1 time[s]\n",
      "...\n"
     ]
    }
   ],
   "source": [
    "bow_doc1 = bag_of_words_corpus[0]\n",
    "\n",
    "print \"Bag of words representation of the first document (tuples are composed by token_id and multiplicity):\\n\", bow_doc1\n",
    "print\n",
    "for i in range(5):\n",
    "    print \"In the document, topic_id {} (word \\\"{}\\\") appears {} time[s]\".format(bow_doc1[i][0], word_count_dict[bow_doc1[i][0]], bow_doc1[i][1])\n",
    "print \"...\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# LDA mono-core (random_state is fixed so that the learned topics are reproducible across runs)\n",
    "lda_model = gensim.models.LdaModel(bag_of_words_corpus, num_topics=10, id2word=word_count_dict, passes=5, random_state=42)\n",
    "\n",
    "# LDA multicore (by default it uses n_cores-1 worker processes)\n",
    "# lda_model = gensim.models.LdaMulticore(bag_of_words_corpus, num_topics=10, id2word=word_count_dict, passes=5, random_state=42)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(5, 0.023679503768661413),\n",
       " (6, 0.41431114779458444),\n",
       " (7, 0.017291639785003393),\n",
       " (8, 0.51751020260211666),\n",
       " (9, 0.018585604973863107)]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lda_model[bag_of_words_corpus[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(8, 0.51750670596247794),\n",
       " (6, 0.41432467490281838),\n",
       " (5, 0.023679370917892333),\n",
       " (9, 0.01858703155864945),\n",
       " (7, 0.017280315593002134)]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(lda_model[bag_of_words_corpus[0]], key=lambda pair: pair[1], reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u\"\\n\\nI am sure some bashers of Pens fans are pretty confused about the lack\\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\\nare killing those Devils worse than I thought. Jagr just showed you why\\nhe is much better than his regular season stats. He is also a lot\\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\\nregular season game.          PENS RULE!!!\\n\\n\""
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 0.517509557652\t Topic: 0.019*\"game\" + 0.016*\"team\" + 0.013*\"games\" + 0.012*\"year\" + 0.009*\"season\" + 0.009*\"hockey\" + 0.009*\"play\" + 0.007*\"league\" + 0.007*\"players\" + 0.006*\"st\"\n",
      "Score: 0.414313056494\t Topic: 0.006*\"going\" + 0.006*\"ll\" + 0.006*\"car\" + 0.006*\"got\" + 0.006*\"said\" + 0.004*\"little\" + 0.004*\"thing\" + 0.004*\"old\" + 0.004*\"lot\" + 0.004*\"sure\"\n",
      "Score: 0.0236794807207\t Topic: 0.049*\"god\" + 0.014*\"jesus\" + 0.012*\"bible\" + 0.011*\"church\" + 0.010*\"christ\" + 0.010*\"christian\" + 0.008*\"man\" + 0.008*\"lord\" + 0.007*\"christians\" + 0.006*\"sin\"\n",
      "Score: 0.0185857822674\t Topic: 0.014*\"db\" + 0.010*\"st\" + 0.010*\"mm\" + 0.008*\"ma\" + 0.008*\"hz\" + 0.007*\"mp\" + 0.007*\"md\" + 0.007*\"ra\" + 0.007*\"mv\" + 0.007*\"lc\"\n",
      "Score: 0.0172902217921\t Topic: 0.012*\"government\" + 0.009*\"key\" + 0.007*\"law\" + 0.006*\"public\" + 0.005*\"president\" + 0.005*\"mr\" + 0.005*\"war\" + 0.005*\"gun\" + 0.005*\"state\" + 0.005*\"armenian\"\n"
     ]
    }
   ],
   "source": [
    "# Topics inferred for the first document, most probable first, each shown with its 10 strongest words\n",
    "for index, score in sorted(lda_model[bag_of_words_corpus[0]], key=lambda pair: pair[1], reverse=True):\n",
    "    print \"Score: {}\\t Topic: {}\".format(score, lda_model.print_topic(index, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'rec.sport.hockey'"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_dataset.target_names[news_dataset.target[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 0.792241842452\t Topic: 0.011*\"windows\" + 0.009*\"dos\" + 0.008*\"file\" + 0.007*\"image\" + 0.007*\"bit\" + 0.006*\"drive\" + 0.006*\"program\" + 0.006*\"card\" + 0.005*\"disk\" + 0.005*\"software\"\n",
      "Score: 0.13329362145\t Topic: 0.026*\"edu\" + 0.015*\"mail\" + 0.014*\"com\" + 0.009*\"available\" + 0.009*\"ftp\" + 0.009*\"send\" + 0.008*\"information\" + 0.008*\"email\" + 0.008*\"list\" + 0.006*\"pub\"\n",
      "Score: 0.0532479917408\t Topic: 0.006*\"going\" + 0.006*\"ll\" + 0.006*\"car\" + 0.006*\"got\" + 0.006*\"said\" + 0.004*\"little\" + 0.004*\"thing\" + 0.004*\"old\" + 0.004*\"lot\" + 0.004*\"sure\"\n"
     ]
    }
   ],
   "source": [
    "# Topics inferred for the second document, most probable first, each shown with its 10 strongest words\n",
    "for index, score in sorted(lda_model[bag_of_words_corpus[1]], key=lambda pair: pair[1], reverse=True):\n",
    "    print \"Score: {}\\t Topic: {}\".format(score, lda_model.print_topic(index, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The unseen document is composed by the following text: In my spare time I either play badmington or drive my car\n",
      "\n",
      "Score: 0.630699929351\t Topic: 0.006*\"going\" + 0.006*\"ll\" + 0.006*\"car\" + 0.006*\"got\" + 0.006*\"said\"\n",
      "Score: 0.209286907818\t Topic: 0.019*\"game\" + 0.016*\"team\" + 0.013*\"games\" + 0.012*\"year\" + 0.009*\"season\"\n",
      "Score: 0.0200124118895\t Topic: 0.011*\"windows\" + 0.009*\"dos\" + 0.008*\"file\" + 0.007*\"image\" + 0.007*\"bit\"\n",
      "Score: 0.0200003248592\t Topic: 0.006*\"believe\" + 0.005*\"point\" + 0.005*\"evidence\" + 0.005*\"question\" + 0.004*\"israel\"\n",
      "Score: 0.0200001895866\t Topic: 0.011*\"space\" + 0.005*\"program\" + 0.005*\"high\" + 0.005*\"research\" + 0.004*\"earth\"\n",
      "Score: 0.0200001686062\t Topic: 0.012*\"government\" + 0.009*\"key\" + 0.007*\"law\" + 0.006*\"public\" + 0.005*\"president\"\n",
      "Score: 0.0200000678415\t Topic: 0.049*\"god\" + 0.014*\"jesus\" + 0.012*\"bible\" + 0.011*\"church\" + 0.010*\"christ\"\n",
      "Score: 0.0200000000377\t Topic: 0.026*\"edu\" + 0.015*\"mail\" + 0.014*\"com\" + 0.009*\"available\" + 0.009*\"ftp\"\n",
      "Score: 0.0200000000058\t Topic: 0.014*\"db\" + 0.010*\"st\" + 0.010*\"mm\" + 0.008*\"ma\" + 0.008*\"hz\"\n",
      "Score: 0.020000000005\t Topic: 0.787*\"ax\" + 0.059*\"max\" + 0.009*\"pl\" + 0.008*\"wm\" + 0.007*\"di\"\n"
     ]
    }
   ],
   "source": [
    "unseen_document = \"In my spare time I either play badmington or drive my car\"\n",
    "print \"The unseen document is composed by the following text:\", unseen_document\n",
    "print\n",
    "\n",
    "# Map the new text onto the training vocabulary, then rank its inferred topics by probability\n",
    "bow_vector = word_count_dict.doc2bow(tokenize(unseen_document))\n",
    "for index, score in sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True):\n",
    "    print \"Score: {}\\t Topic: {}\".format(score, lda_model.print_topic(index, 5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Log perplexity of the model is -7.57361194518\n"
     ]
    }
   ],
   "source": [
    "print \"Log perplexity of the model is\", lda_model.log_perplexity(bag_of_words_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'LdaModel' object has no attribute 'components_'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-41-feb4d8607bb0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mlda_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcomponents_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m: 'LdaModel' object has no attribute 'components_'"
     ]
    }
   ],
   "source": [
    "lda_model.components_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0,\n",
       "  u'0.787*\"ax\" + 0.059*\"max\" + 0.009*\"pl\" + 0.008*\"wm\" + 0.007*\"di\" + 0.007*\"tm\" + 0.006*\"ei\" + 0.005*\"ey\" + 0.003*\"gk\" + 0.003*\"um\"'),\n",
       " (1,\n",
       "  u'0.026*\"edu\" + 0.015*\"mail\" + 0.014*\"com\" + 0.009*\"available\" + 0.009*\"ftp\" + 0.009*\"send\" + 0.008*\"information\" + 0.008*\"email\" + 0.008*\"list\" + 0.006*\"pub\"'),\n",
       " (2,\n",
       "  u'0.006*\"believe\" + 0.005*\"point\" + 0.005*\"evidence\" + 0.005*\"question\" + 0.004*\"israel\" + 0.004*\"religion\" + 0.004*\"true\" + 0.004*\"things\" + 0.004*\"fact\" + 0.003*\"reason\"'),\n",
       " (3,\n",
       "  u'0.011*\"windows\" + 0.009*\"dos\" + 0.008*\"file\" + 0.007*\"image\" + 0.007*\"bit\" + 0.006*\"drive\" + 0.006*\"program\" + 0.006*\"card\" + 0.005*\"disk\" + 0.005*\"software\"'),\n",
       " (4,\n",
       "  u'0.011*\"space\" + 0.005*\"program\" + 0.005*\"high\" + 0.005*\"research\" + 0.004*\"earth\" + 0.004*\"health\" + 0.004*\"data\" + 0.004*\"center\" + 0.004*\"year\" + 0.004*\"nasa\"'),\n",
       " (5,\n",
       "  u'0.049*\"god\" + 0.014*\"jesus\" + 0.012*\"bible\" + 0.011*\"church\" + 0.010*\"christ\" + 0.010*\"christian\" + 0.008*\"man\" + 0.008*\"lord\" + 0.007*\"christians\" + 0.006*\"sin\"'),\n",
       " (6,\n",
       "  u'0.006*\"going\" + 0.006*\"ll\" + 0.006*\"car\" + 0.006*\"got\" + 0.006*\"said\" + 0.004*\"little\" + 0.004*\"thing\" + 0.004*\"old\" + 0.004*\"lot\" + 0.004*\"sure\"'),\n",
       " (7,\n",
       "  u'0.012*\"government\" + 0.009*\"key\" + 0.007*\"law\" + 0.006*\"public\" + 0.005*\"president\" + 0.005*\"mr\" + 0.005*\"war\" + 0.005*\"gun\" + 0.005*\"state\" + 0.005*\"armenian\"'),\n",
       " (8,\n",
       "  u'0.019*\"game\" + 0.016*\"team\" + 0.013*\"games\" + 0.012*\"year\" + 0.009*\"season\" + 0.009*\"hockey\" + 0.009*\"play\" + 0.007*\"league\" + 0.007*\"players\" + 0.006*\"st\"'),\n",
       " (9,\n",
       "  u'0.014*\"db\" + 0.010*\"st\" + 0.010*\"mm\" + 0.008*\"ma\" + 0.008*\"hz\" + 0.007*\"mp\" + 0.007*\"md\" + 0.007*\"ra\" + 0.007*\"mv\" + 0.007*\"lc\"')]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lda_model.show_topics(num_topics=-1, num_words=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
